{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "de6537c4",
   "metadata": {},
   "source": [
    "# BatchEvalRunner - Running Multiple Evaluations\n",
    "\n",
    "The `BatchEvalRunner` class can be used to run a series of evaluations asynchronously. The async jobs are limited to a defined size of `num_workers`.\n",
    "\n",
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db405f59",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install llama-index-llms-openai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a8304f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# attach to the same event-loop\n",
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff59a851",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import openai\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
    "# openai.api_key = os.environ[\"OPENAI_API_KEY\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Response\n",
    "from llama_index.llms.openai import OpenAI\n",
    "from llama_index.core.evaluation import (\n",
    "    FaithfulnessEvaluator,\n",
    "    RelevancyEvaluator,\n",
    "    CorrectnessEvaluator,\n",
    ")\n",
    "from llama_index.core.node_parser import SentenceSplitter\n",
    "import pandas as pd\n",
    "\n",
    "pd.set_option(\"display.max_colwidth\", 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "efe66f2a",
   "metadata": {},
   "source": [
    "Using GPT-4 here for evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9b98f89-d5b8-4d29-92f6-ad76d5060e9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# gpt-4\n",
    "gpt4 = OpenAI(temperature=0, model=\"gpt-4\")\n",
    "\n",
    "faithfulness_gpt4 = FaithfulnessEvaluator(llm=gpt4)\n",
    "relevancy_gpt4 = RelevancyEvaluator(llm=gpt4)\n",
    "correctness_gpt4 = CorrectnessEvaluator(llm=gpt4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1298bbb4-c99e-431e-93ef-eb32c0a2fc2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = SimpleDirectoryReader(\"./test_wiki_data/\").load_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41f0e53f-77a6-40d5-94ae-3f81b01af75c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create vector index\n",
    "llm = OpenAI(temperature=0.3, model=\"gpt-3.5-turbo\")\n",
    "splitter = SentenceSplitter(chunk_size=512)\n",
    "vector_index = VectorStoreIndex.from_documents(\n",
    "    documents, transformations=[splitter]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "400f486d",
   "metadata": {},
   "source": [
    "## Question Generation\n",
    "\n",
    "To run evaluations in batch, you can create the runner and then call the `.aevaluate_queries()` function on a list of queries.\n",
    "\n",
    "First, we can generate some questions and then run evaluation on them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "976e0a93",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install spacy datasets span-marker scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e31e10e6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/loganm/llama_index_proper/llama_index/llama_index/evaluation/dataset_generation.py:187: DeprecationWarning: Call to deprecated class DatasetGenerator. (Deprecated in favor of `RagDatasetGenerator` which should be used instead.)\n",
      "  return cls(\n",
      "/home/loganm/llama_index_proper/llama_index/llama_index/evaluation/dataset_generation.py:282: DeprecationWarning: Call to deprecated class QueryResponseDataset. (Deprecated in favor of `LabelledRagDataset` which should be used instead.)\n",
      "  return QueryResponseDataset(queries=queries, responses=responses_dict)\n"
     ]
    }
   ],
   "source": [
    "from llama_index.core.evaluation import DatasetGenerator\n",
    "\n",
    "dataset_generator = DatasetGenerator.from_documents(documents, llm=llm)\n",
    "\n",
    "qas = dataset_generator.generate_dataset_from_nodes(num=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f3d6861e",
   "metadata": {},
   "source": [
    "## Running Batch Evaluation\n",
    "\n",
    "Now, we can run our batch evaluation!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "180a5d2e-9286-477b-9cd0-a5976d18d845",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core.evaluation import BatchEvalRunner\n",
    "\n",
    "runner = BatchEvalRunner(\n",
    "    {\"faithfulness\": faithfulness_gpt4, \"relevancy\": relevancy_gpt4},\n",
    "    workers=8,\n",
    ")\n",
    "\n",
    "eval_results = await runner.aevaluate_queries(\n",
    "    vector_index.as_query_engine(llm=llm), queries=qas.questions\n",
    ")\n",
    "\n",
    "# If we had ground-truth answers, we could also include the correctness evaluator like below.\n",
    "# The correctness evaluator depends on additional kwargs, which are passed in as a dictionary.\n",
    "# Each question is mapped to a set of kwargs\n",
    "#\n",
    "\n",
    "# runner = BatchEvalRunner(\n",
    "#     {\"correctness\": correctness_gpt4},\n",
    "#     workers=8,\n",
    "# )\n",
    "\n",
    "# eval_results = await runner.aevaluate_queries(\n",
    "#     vector_index.as_query_engine(),\n",
    "#     queries=qas.queries,\n",
    "#     reference=[qr[1] for qr in qas.qr_pairs],\n",
    "# )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0eff6823",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3\n"
     ]
    }
   ],
   "source": [
    "print(len([qr for qr in qas.qr_pairs]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b256b98c",
   "metadata": {},
   "source": [
    "## Inspecting Outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4ab78d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['correctness'])\n",
      "dict_keys(['query', 'contexts', 'response', 'passing', 'feedback', 'score', 'pairwise_source'])\n",
      "False\n",
      "The context information does not provide any information related to the query. Therefore, I cannot provide an answer based on the given context.\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "print(eval_results.keys())\n",
    "\n",
    "print(eval_results[\"correctness\"][0].dict().keys())\n",
    "\n",
    "print(eval_results[\"correctness\"][0].passing)\n",
    "print(eval_results[\"correctness\"][0].response)\n",
    "print(eval_results[\"correctness\"][0].contexts)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8b20df9",
   "metadata": {},
   "source": [
    "## Reporting Total Scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35fdbe0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_eval_results(key, eval_results):\n",
    "    results = eval_results[key]\n",
    "    correct = 0\n",
    "    for result in results:\n",
    "        if result.passing:\n",
    "            correct += 1\n",
    "    score = correct / len(results)\n",
    "    print(f\"{key} Score: {score}\")\n",
    "    return score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2435e398",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "correctness Score: 0.0\n"
     ]
    }
   ],
   "source": [
    "score = get_eval_results(\"correctness\", eval_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f57c938",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "relevancy Score: 0.96\n"
     ]
    }
   ],
   "source": [
    "score = get_eval_results(\"relevancy\", eval_results)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
